In [8]:
# FIRST STEPS IN PYTHON. VARIABLES, OPERATIONS, DATA SETS, PLOTS
# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# Vectors and simple operations
# Build a small "vector" (a plain Python list) holding the odd numbers 1, 3, ..., 9.
x = list(range(1, 10, 2))
x  # A bare name on the last line of a cell displays its value
Out[8]:
[1, 3, 5, 7, 9]
In [10]:
print(x)  # print() writes the list as text output rather than as the cell's Out[] value
[1, 3, 5, 7, 9]
In [13]:
# In Python, indexing starts at 0, so x[1] is the 2nd element of x (x[0] is the 1st).
print(x[1])
3
In [15]:
# Arithmetic operations on plain Python lists
x+x # For lists, + concatenates the two lists; it does not add element by element
Out[15]:
[1, 3, 5, 7, 9, 1, 3, 5, 7, 9]
In [17]:
3*x # Not element-wise multiplication either: the list is repeated 3 times (concatenation again)
Out[17]:
[1, 3, 5, 7, 9, 1, 3, 5, 7, 9, 1, 3, 5, 7, 9]
In [19]:
# Element-wise squaring of a list needs an explicit loop (here, a list comprehension).
squared_x = [value ** 2 for value in x]
print(squared_x)
[1, 9, 25, 49, 81]
In [21]:
# Element-wise natural logarithm of a list.
# Use a regular import statement instead of calling __import__('math')
# again for every element inside the comprehension.
import math

# math.log() raises ValueError for non-positive inputs, so those are mapped to NaN.
log_x = [math.log(y) if y > 0 else float('nan') for y in x]
print(log_x)
[0.0, 1.0986122886681098, 1.6094379124341003, 1.9459101490553132, 2.1972245773362196]
In [23]:
# Basic statistics
# Mean: the sum of the elements divided by the number of elements.
mean_x = sum(x) / len(x)
print(mean_x)
5.0
In [25]:
# Standard deviation computed by hand: the square root of the average squared
# deviation from the mean. The divisor is len(x) (n, not n - 1), i.e. this is
# the population form of the standard deviation.
squared_deviations = [(y - mean_x) ** 2 for y in x]
sd_x = (sum(squared_deviations) / len(x)) ** 0.5
print(sd_x)
2.8284271247461903
In [27]:
# Doing all of this by hand is too cumbersome, so from here on we use Python libraries.
# The first one is NumPy ("Numerical Python").
import numpy as np  # "np" is the abbreviation used for the rest of the notebook

x = np.array([1, 3, 5, 7, 9])  # x is now a NumPy array instead of a list
x + x  # On arrays, + is standard element-wise addition
Out[27]:
array([ 2, 6, 10, 14, 18])
In [29]:
3*x  # Element-wise: every entry of the array is multiplied by 3
Out[29]:
array([ 3, 9, 15, 21, 27])
In [31]:
x**2  # Element-wise squaring; no explicit loop is needed for arrays
Out[31]:
array([ 1, 9, 25, 49, 81])
In [33]:
np.log(x)  # Natural logarithm applied to each element of the array
Out[33]:
array([0. , 1.09861229, 1.60943791, 1.94591015, 2.19722458])
In [35]:
# A 2-D array serves as a matrix: each inner list is one row (2 rows x 3 columns).
A = np.array([
    [1, 3, 5],
    [6, 8, 10],
])
A
Out[35]:
array([[ 1, 3, 5], [ 6, 8, 10]])
In [37]:
# Generate data, an array of Normal random numbers.
# Seed the random number generator first so that Z, and every statistic computed
# from it, is identical each time the notebook is re-run. The seed value itself
# is arbitrary. (Numbers recorded from earlier, unseeded runs will differ.)
np.random.seed(427)
Z = np.random.normal(0,1,100) # mean, standard deviation, and sample size
In [39]:
# The same 2x3 values built with np.matrix from a string
# ("," separates entries within a row, ";" separates rows).
# Note: the result is an np.matrix object, not an ndarray. NumPy's documentation
# no longer recommends this class and suggests regular arrays instead.
A = np.matrix('1,3,5; 6,8,10')
A
Out[39]:
matrix([[ 1, 3, 5], [ 6, 8, 10]])
In [41]:
np.mean(Z)  # Sample mean of Z
Out[41]:
-0.10603236278254234
In [43]:
np.std(Z)  # Standard deviation of Z; by default NumPy divides by n (ddof=0), not n - 1
Out[43]:
1.082144924956909
In [45]:
Z.mean() # Another way of calculating the sample mean: the array's own .mean() method
Out[45]:
-0.10603236278254234
In [51]:
# Read data from an external file
# Relative file names are resolved against the current working directory;
# the os module lets us inspect (and change) that directory.
import os
print(os.getcwd()) # Get current working directory
C:\Users\baron\Documents\Teach\627 Statistical Machine Learning\Data
In [49]:
# Change the working directory to the folder that holds the course data files.
# This is an absolute path on one specific machine; edit it to match your own setup.
data_dir = "C:\\Users\\baron\\Documents\\Teach\\627 Statistical Machine Learning\\Data"
os.chdir(data_dir)
In [53]:
# Use pandas to read files
import pandas as pd
# Read a comma-separated values file into a DataFrame. The relative file name is
# looked up in the current working directory.
# NOTE(review): 'horsepower' is listed among the columns but is missing from the
# describe() summary, so it appears to be read as a non-numeric (object) column -
# check Auto.csv for non-numeric placeholders in that column.
auto = pd.read_csv("Auto.csv")
In [55]:
# Find out the dimensions and variables of the data set
print(auto.shape) # Tuple: (number of rows, number of columns)
(397, 9)
In [57]:
print(auto.columns) # Variable (column) names, stored as a pandas Index
Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name'], dtype='object')
In [59]:
print(auto.describe()) # Summary statistics; by default only numeric columns are summarized
mpg cylinders displacement weight acceleration \ count 397.000000 397.000000 397.000000 397.000000 397.000000 mean 23.515869 5.458438 193.532746 2970.261965 15.555668 std 7.825804 1.701577 104.379583 847.904119 2.749995 min 9.000000 3.000000 68.000000 1613.000000 8.000000 25% 17.500000 4.000000 104.000000 2223.000000 13.800000 50% 23.000000 4.000000 146.000000 2800.000000 15.500000 75% 29.000000 8.000000 262.000000 3609.000000 17.100000 max 46.600000 8.000000 455.000000 5140.000000 24.800000 year origin count 397.000000 397.000000 mean 75.994962 1.574307 std 3.690005 0.802549 min 70.000000 1.000000 25% 73.000000 1.000000 50% 76.000000 1.000000 75% 79.000000 2.000000 max 82.000000 3.000000
In [61]:
# Look at the data as a spreadsheet
auto.head() # Show first 5 rows (the default when no row count is given)
Out[61]:
mpg | cylinders | displacement | horsepower | weight | acceleration | year | origin | name | |
---|---|---|---|---|---|---|---|---|---|
0 | 18.0 | 8 | 307.0 | 130 | 3504 | 12.0 | 70 | 1 | chevrolet chevelle malibu |
1 | 15.0 | 8 | 350.0 | 165 | 3693 | 11.5 | 70 | 1 | buick skylark 320 |
2 | 18.0 | 8 | 318.0 | 150 | 3436 | 11.0 | 70 | 1 | plymouth satellite |
3 | 16.0 | 8 | 304.0 | 150 | 3433 | 12.0 | 70 | 1 | amc rebel sst |
4 | 17.0 | 8 | 302.0 | 140 | 3449 | 10.5 | 70 | 1 | ford torino |
In [63]:
# Refer to a particular variable (column) in this dataset by its name
print(auto['name'])
0 chevrolet chevelle malibu 1 buick skylark 320 2 plymouth satellite 3 amc rebel sst 4 ford torino ... 392 ford mustang gl 393 vw pickup 394 dodge rampage 395 ford ranger 396 chevy s-10 Name: name, Length: 397, dtype: object
In [65]:
print(auto['mpg'].mean()) # Mean of the 'mpg' column
23.51586901763224
In [67]:
print(auto['mpg'].describe()) # Summary statistics of the 'mpg' column only
count 397.000000 mean 23.515869 std 7.825804 min 9.000000 25% 17.500000 50% 23.000000 75% 29.000000 max 46.600000 Name: mpg, dtype: float64
In [69]:
# PLOTS. Before you do anything with the data, look at them. Use the matplotlib library.
import matplotlib.pyplot as plt

# Basic scatterplot with default settings.
plt.scatter(auto['weight'], auto['mpg'])
# Finish this figure here. Without this call the next scatter would be drawn on
# the same axes, and the identical green points would completely cover these.
plt.show()

# The same scatterplot with axis labels, graph title, and color.
plt.scatter(auto['weight'], auto['mpg'], color='green')
plt.xlabel('Weight')
plt.ylabel('MPG')
plt.title('Plot of Miles per Gallon')
plt.show()
In [196]:
# Another scatterplot: MPG against the number of cylinders,
# drawn through an explicit Axes object.
fig, ax = plt.subplots()
ax.scatter(auto['cylinders'], auto['mpg'])
ax.set_xlabel('Cylinders')
ax.set_ylabel('MPG')
plt.show()
In [186]:
# Treat "cylinders" as a categorical variable: convert the column to pandas'
# category dtype. This overwrites the 'cylinders' column of auto in place.
auto['cylinders'] = auto['cylinders'].astype('category')
# Draw one boxplot of mpg for each group defined by the values of cylinders.
auto.boxplot(column='mpg', by='cylinders')
plt.show()
In [202]:
# SCATTERPLOT MATRIX
# scatter_matrix only draws numeric columns. 'horsepower' is absent from the
# describe() summary of auto, i.e. it is not stored as a numeric column, so it
# would be silently left out of the plot. Convert it on a copy first (auto itself
# is not modified); entries that cannot be parsed as numbers become NaN.
scatter_vars = auto[['mpg', 'weight', 'horsepower', 'year']].copy()
scatter_vars['horsepower'] = pd.to_numeric(scatter_vars['horsepower'], errors='coerce')
pd.plotting.scatter_matrix(scatter_vars, figsize=(6,6))
plt.show() # Histograms on the diagonal, scatterplots of the corresponding variables elsewhere